Data WorkFlow

Author

Filippo Dall’Olio

Pakages

First, I load the required packages.

Code
devtools::install_github("f-dallolio/adtabler")
1
library(tidyverse, quietly = TRUE)
library(tsibble, quietly = TRUE)
library(rlang, quietly = TRUE)
library(glue, quietly = TRUE)
library(adtabler, quietly = TRUE)
library(furrr)

Data Info

Define the directory containing the AdIntel folders and retrieve data paths.

Code
adintel_dir_home <- "/mnt/sata_data_1/adintel"

adintel_folder <- list.files(adintel_dir_home, full.names = TRUE) |>
  stringr::str_replace_all("//", "/")

adintel_all_files_old <- list.files(adintel_dir_home,
  full.names = TRUE,
  recursive = TRUE
)

adintel_all_files_new <- adintel_all_files_old |>
  stringr::str_replace_all(" ", "_") |>
  stringr::str_replace_all("//", "/")
Code
file.rename(from = adintel_all_files_old, to = adintel_all_files_new) 
Code
adintel_all_files <- list.files(adintel_dir_home,
  full.names = TRUE,
  recursive = TRUE
)

Info - Dynamic Files

Retrieve file paths for 2020 “dynamic” files (i.e. they can change year by year):

Code
adintel_files_dyn <- adintel_all_files |>
  str_subset(pattern = no_case("master_file"), negate = TRUE) |>
  as_tibble_col("full_file_name") |>
  mutate(
    year = as_numeric2(str_split_i(full_file_name, "/", -3)),
    file_type_data = str_split_i(full_file_name, "/", -2),
    file_name_data = str_split_i(full_file_name, "/", -1),
    file_type_std = rename_adintel(file_type_data),
    file_name_std = rename_adintel(str_split_i(file_name_data, "\\.", 1)),
    .before = full_file_name
  ) |>
  arrange(year, file_type_data, file_name_data)

Info - Static Files

Retrieve file paths for “static” files (i.e. they do not change year by year):

Code
adintel_files_static <- adintel_all_files |>
  str_subset(pattern = no_case("master_file")) |>
  str_subset(pattern = no_case("Latest")) |>
  str_subset(pattern = as.character(max(adintel_files_dyn$year))) |>
  as_tibble_col("full_file_name") |>
  mutate(
    year = NA,
    file_type_data = "References",
    file_name_data = str_split_i(full_file_name, "/", -1),
    file_type_std = rename_adintel(file_type_data),
    file_name_std = rename_adintel(str_split_i(file_name_data, "\\.", 1)),
    .before = full_file_name
  ) |>
  arrange(file_type_data, file_name_data)

Data Checks

The function takes the full path of a file (full_file_name) and returns a data frame/tibble with the the number of columns (n_cols), the original column names (col_name_data) an their positions (col_pos), an the new column names standardized with the function adtabler::rename_adintel().

nested tibble with the name of the file (file), the string used (sep) to separate columns, the number of rows (n_rows), the number of columns (n_cols), and the column names (col_name_man). I use _man at the end of the column indicating the column names in the original files to indicate that it is the name used in the Adintel _manual. The column col_name_std contains standardized column names. Standardization is performed by the function adtabler::rename_adintel().

Code
fn_col_info <- function(file) {
  
  x <- file
  
  # load the first 100 rows of data
  df <- data.table::fread(
    file = x,
    nrows = 100
  )
  
  # retrieve column names
  col_name_data <- names(df)
  
  # retrieve number of columns
  n_cols <- NCOL(df)
  
  # retrieve the position of columns in the original data
  col_pos <- seq_along(col_name_data)
  
  # standardize column names
  col_name_std <- rename_adintel(col_name_data)
  
  # return a tibble/data frame with:
  #  Number of columns (1), 
  #  original column names (3) and their positions (2),
  # standardized column names.
  tibble(
    n_cols,
    col_pos,
    col_name_data,
    col_name_std
  )
}

The data frame row_numbers below is a pre-calculated table that contains the number of rows for each file. The number of rows was efficiently computed with the read_nrows function.

Code
data("row_numbers")

data_info <- adintel_files_dyn |> 
  bind_rows(
    adintel_files_static
  ) |>
  mutate(
    data = full_file_name |> 
      map(
        .f = ~ fn_col_info(.x)
      )
  ) |>
  unnest(
    everything()
  ) |>
  inner_join(
    row_numbers
  ) |>
  select(
    - n_lines_w_header
  ) |>
  relocate(
    n_rows, .before = n_cols
  ) |>
  relocate(
    full_file_name, .after = last_col()
  )
Joining with `by = join_by(full_file_name)`
Code
data_info
Code
data("references_columns")
references_columns
Code
data("occurrences_columns")
occurrences_columns
Code
data("lookup_datatype")
lookup_datatype
Code
refer_occurr_info <- occurrences_columns |> 
  nest( unique_key_info = c(media_type_id, col_unique_key) ) |> 
  bind_rows(
    references_columns |> nest(unique_key_info = col_unique_key)
    ) |>
  left_join(
    lookup_datatype
  ) |>
  relocate(
    datatype_r, datatype_sql, .before = sql_precision
  ) |>
  nest(
    data_type_info = c(
      datatype_r, datatype_sql, 
      sql_precision, sql_scale, 
      datatype_man, datatype_std
    ),
    manual_info = c(col_name_man, description
    )
  )
Joining with `by = join_by(datatype_man)`
Code
data_info <- data_info |>
  nest(
    data_col_original = contains("_data"),
    data_file_info = c(year, n_rows, n_cols, full_file_name)
  ) |>
  left_join( 
    refer_occurr_info
  )
Joining with `by = join_by(file_name_std, col_pos, col_name_std)`
Code
data_info
Code
data_info_unnested <- data_info |> 
  unnest(c(unique_key_info, data_type_info, manual_info)) |> 
  unnest(c(data_col_original, data_file_info))
data_info_unnested
Code
rm(list = setdiff(ls(), c("data_info", "data_info_unnested")))
Code
unique_key <- data_info_unnested |> 
  summarise(col_unique_key = 
              list(unique(col_unique_key) |> 
                     adtabler::na_rm() |> 
                     paste0(collapse = ",")) |> 
              unlist(),
       .by = c(file_type_std, file_name_std, media_type_id)) 
unique_key
Code
unique_key_nested <- unique_key |>
  nest(.by = c(file_type_std, file_name_std),
       .key = "unique_key")
unique_key_nested
Code
file_info <- data_info |> 
  select(file_type_std, file_name_std, data_file_info, data_col_original) |> 
  unnest(everything()) |> 
  select(file_type_std,
         file_name_std,
         file_name_data,
         col_name_data,
         year : full_file_name) |> 
  nest(col_name_data = col_name_data) |> 
  mutate(col_name_data = col_name_data |> map(~ .x[[1]] |> paste(collapse = ",")) |> unlist()) 

file_info 
Code
file_info_nested <- file_info |> 
  nest(.by = c(file_type_std, file_name_std),
       .key = "col_info")
file_info_nested
Code
col_info <- data_info |> 
  select(file_type_std : col_name_std, data_type_info) |>
  unnest(everything()) |> 
  select(file_type_std,
         file_name_std,
         col_pos,
         col_name_std,
         datatype_r : datatype_std) |> 
  distinct() 

col_info
Code
col_info_nested <- col_info   |>
  nest(.by = c(file_type_std, file_name_std),
       .key = "col_info")
col_info_nested
Code
manual_info <- data_info |> 
  select(file_type_std, file_name_std, manual_info,data_type_info) |> 
  unnest(everything()) |> 
  select(file_type_std,
       file_name_std,
       c(contains("_man"), description)) |> 
  distinct() |>
  relocate(col_name_man, .before = datatype_man) 

manual_info
Code
manual_info_nested <- manual_info |>
  nest(.by = c(file_type_std, file_name_std),
       .key = "manual_info")
manual_info_nested
Code
data_info_list <-   list(
    unique_key = unique_key,
    file_info = file_info,
    col_info = col_info,
    manual_info = manual_info
  )
data_info_list
$unique_key
# A tibble: 51 × 4
   file_type_std file_name_std media_type_id col_unique_key                     
   <chr>         <chr>                 <int> <chr>                              
 1 occurrences   cinema                   27 ""                                 
 2 occurrences   cinema                   28 ""                                 
 3 occurrences   fsi_coupon               12 "ad_date,market_code,media_type_id…
 4 occurrences   internet                 25 ""                                 
 5 occurrences   internet                 26 ""                                 
 6 occurrences   magazine                  7 ""                                 
 7 occurrences   magazine                  8 ""                                 
 8 occurrences   network_tv                1 ""                                 
 9 occurrences   network_tv                4 ""                                 
10 occurrences   network_tv                2 ""                                 
# ℹ 41 more rows

$file_info
# A tibble: 354 × 8
   file_type_std file_name_std file_name_data  year n_rows n_cols full_file_name
   <chr>         <chr>         <chr>          <int>  <int>  <int> <chr>         
 1 impressions   imp_national… ImpNationalTV…  2010 1.70e7     48 /mnt/sata_dat…
 2 impressions   imp_national… ImpNationalTV…  2011 1.93e7     48 /mnt/sata_dat…
 3 impressions   imp_national… ImpNationalTV…  2012 1.99e7     48 /mnt/sata_dat…
 4 impressions   imp_national… ImpNationalTV…  2013 2.02e7     48 /mnt/sata_dat…
 5 impressions   imp_national… ImpNationalTV…  2014 2.10e7     48 /mnt/sata_dat…
 6 impressions   imp_national… ImpNationalTV…  2015 2.12e7     48 /mnt/sata_dat…
 7 impressions   imp_national… ImpNationalTV…  2016 2.12e7     48 /mnt/sata_dat…
 8 impressions   imp_national… ImpNationalTV…  2017 2.14e7     48 /mnt/sata_dat…
 9 impressions   imp_national… ImpNationalTV…  2018 2.18e7     48 /mnt/sata_dat…
10 impressions   imp_national… ImpNationalTV…  2019 2.18e7     48 /mnt/sata_dat…
# ℹ 344 more rows
# ℹ 1 more variable: col_name_data <chr>

$col_info
# A tibble: 515 × 10
   file_type_std file_name_std   col_pos col_name_std    datatype_r datatype_sql
   <chr>         <chr>             <int> <chr>           <chr>      <chr>       
 1 impressions   imp_national_tv       1 media_type_id   <NA>       <NA>        
 2 impressions   imp_national_tv       2 data_stream_id  <NA>       <NA>        
 3 impressions   imp_national_tv       3 distributor_id  <NA>       <NA>        
 4 impressions   imp_national_tv       4 hispanic_flag   <NA>       <NA>        
 5 impressions   imp_national_tv       5 impression_type <NA>       <NA>        
 6 impressions   imp_national_tv       6 nielsen_progra… <NA>       <NA>        
 7 impressions   imp_national_tv       7 telecast_number <NA>       <NA>        
 8 impressions   imp_national_tv       8 impression_date <NA>       <NA>        
 9 impressions   imp_national_tv       9 time_interval_… <NA>       <NA>        
10 impressions   imp_national_tv      10 tvhh            <NA>       <NA>        
# ℹ 505 more rows
# ℹ 4 more variables: sql_precision <chr>, sql_scale <chr>, datatype_man <chr>,
#   datatype_std <chr>

$manual_info
# A tibble: 261 × 5
   file_type_std file_name_std     col_name_man  datatype_man description       
   <chr>         <chr>             <chr>         <chr>        <chr>             
 1 impressions   imp_national_tv   <NA>          <NA>         <NA>              
 2 impressions   imp_spot_radio    <NA>          <NA>         <NA>              
 3 impressions   imp_spot_tv       <NA>          <NA>         <NA>              
 4 market_breaks imp_market_breaks <NA>          <NA>         <NA>              
 5 occurrences   cinema            AdDate        Character    Actual date of oc…
 6 occurrences   cinema            MarketCode    Character    Code of Market, a…
 7 occurrences   cinema            MediaTypeID   SmallInt     Code of Media Typ…
 8 occurrences   cinema            PrimBrandCode Integer      Code of Primary B…
 9 occurrences   cinema            ScndBrandCode Integer      Code of Secondary…
10 occurrences   cinema            TerBrandCode  Integer      Code of Tertiary …
# ℹ 251 more rows
Code
data_info_nested <- tibble(
  info_type = names(data_info_list),
  info = data_info_list
)
data_info_nested
Code
usethis::use_data(data_info_list, overwrite = T)
✔ Setting active project to '/home/filippo/Documents/r_wd/adtabler'
✔ Saving 'data_info_list' to 'data/data_info_list.rda'
• Document your data (see 'https://r-pkgs.org/data.html')
Code
usethis::use_data(data_info_nested, overwrite = T)
✔ Saving 'data_info_nested' to 'data/data_info_nested.rda'
• Document your data (see 'https://r-pkgs.org/data.html')